Mining, Classification, Prediction

McKenzie Castro mnc927

Introduction

This project explores the fivethirtyeight "bad drivers" dataset, which records fatal-collision rates and related driver behaviors for all fifty U.S. states and the District of Columbia, alongside car-insurance premiums and insurer losses. I chose it because it connects driver behavior (speeding, alcohol impairment, distraction) to concrete financial outcomes, making it a natural candidate for clustering, dimensionality reduction, classification, and numeric prediction.

# Attach the tidyverse plus the fivethirtyeight data packages.
library(tidyverse)
library(fivethirtyeight)
library(fivethirtyeightdata)
# Read the bad-drivers CSV from the course data directory and preview it.
drivers <- read_csv("/stor/home/mnc927/data/bad-drivers/bad-drivers.csv")
drivers %>% glimpse()
## Rows: 51
## Columns: 8
## $ State                                                                                                    <chr> …
## $ `Number of drivers involved in fatal collisions per billion miles`                                       <dbl> …
## $ `Percentage Of Drivers Involved In Fatal Collisions Who Were Speeding`                                   <dbl> …
## $ `Percentage Of Drivers Involved In Fatal Collisions Who Were Alcohol-Impaired`                           <dbl> …
## $ `Percentage Of Drivers Involved In Fatal Collisions Who Were Not Distracted`                             <dbl> …
## $ `Percentage Of Drivers Involved In Fatal Collisions Who Had Not Been Involved In Any Previous Accidents` <dbl> …
## $ `Car Insurance Premiums ($)`                                                                             <dbl> …
## $ `Losses incurred by insurance companies for collisions per insured driver ($)`                           <dbl> …
# Build a two-level Region factor from State. Naming the state vectors first
# makes the east/west split easier to audit than one giant inline call.
eastern_states <- c(
  "Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island",
  "Vermont", "New Jersey", "New York", "Pennsylvania", "Illinois",
  "Indiana", "Michigan", "Ohio", "Wisconsin", "Iowa", "Kansas",
  "Minnesota", "Missouri", "Nebraska", "North Dakota", "South Dakota",
  "Delaware", "Florida", "Georgia", "Maryland", "District of Columbia",
  "North Carolina", "South Carolina", "Virginia", "West Virginia",
  "Alabama", "Kentucky", "Mississippi", "Tennessee", "Arkansas",
  "Louisiana", "Oklahoma", "Texas"
)
western_states <- c(
  "Arizona", "Colorado", "Idaho", "Montana", "Nevada", "New Mexico", "Utah",
  "Wyoming", "Alaska", "California", "Hawaii", "Oregon", "Washington"
)
drivers2 <- drivers %>%
  mutate(Region = fct_collapse(State,
    Eastern = eastern_states,
    Western = western_states
  ))

# dplyr is already attached via library(tidyverse) above; kept for clarity.
library(dplyr)
# Shorten the verbose fivethirtyeight column names. One rename() call replaces
# the original seven successive reassignments of drivers2.
drivers2 <- drivers2 %>%
  rename(
    FatalCollisions = "Number of drivers involved in fatal collisions per billion miles",
    Speeding = "Percentage Of Drivers Involved In Fatal Collisions Who Were Speeding",
    Alcohol = "Percentage Of Drivers Involved In Fatal Collisions Who Were Alcohol-Impaired",
    NotDistracted = "Percentage Of Drivers Involved In Fatal Collisions Who Were Not Distracted",
    NoAccidents = "Percentage Of Drivers Involved In Fatal Collisions Who Had Not Been Involved In Any Previous Accidents",
    Losses = "Losses incurred by insurance companies for collisions per insured driver ($)",
    Premium = "Car Insurance Premiums ($)"
  )
head(drivers2)
## # A tibble: 6 x 9
##   State FatalCollisions Speeding Alcohol NotDistracted NoAccidents Premium
##   <chr>           <dbl>    <dbl>   <dbl>         <dbl>       <dbl>   <dbl>
## 1 Alab…            18.8       39      30            96          80    785.
## 2 Alas…            18.1       41      25            90          94   1053.
## 3 Ariz…            18.6       35      28            84          96    899.
## 4 Arka…            22.4       18      26            94          95    827.
## 5 Cali…            12         35      28            91          89    878.
## 6 Colo…            13.6       37      28            79          95    836.
## # … with 2 more variables: Losses <dbl>, Region <fct>

The dataset I am using consists of information on which U.S. state has the worst drivers based on the number of drivers involved in fatal collisions per billion miles; the percentage of drivers involved in fatal collisions who were speeding, alcohol-impaired, not distracted, and not involved in previous accidents; as well as car insurance premiums and losses incurred by insurance companies for collisions per insured driver. I found the data using ‘fivethirtyeight’. The variables measure the percentage of drivers involved in fatal collisions under various circumstances, as well as car insurance premiums and losses in dollar amounts. There are 51 observations, for all fifty states plus Washington, D.C. I created a binary variable called Region that groups states into two regions, Eastern and Western. There are thirty-eight observations in the Eastern group and thirteen in the Western group. Eastern is equal to 1 and Western is equal to 0.

Cluster Analysis

library(cluster)

# Choose a candidate number of clusters by average silhouette width.
# kmeans() uses random starting centers, so fix the seed for reproducibility
# (the original chunk had no seed, and loaded cluster twice).
set.seed(322)
clust_dat <- drivers2 %>% dplyr::select(Speeding, Alcohol)

sil_width <- vector()
for (i in 2:10) {
    kms <- kmeans(clust_dat, centers = i)
    sil <- silhouette(kms$cluster, dist(clust_dat))
    # sil_width[1] stays NA: a silhouette is undefined for k = 1.
    sil_width[i] <- mean(sil[, 3])
}

# Fit PAM with two medoids on the Speeding/Alcohol data and print the fit.
pam1 <- pam(clust_dat, k = 2)
pam1
## Medoids:
##      ID Speeding Alcohol
## [1,] 29       37      32
## [2,] 10       21      29
## Clustering vector:
##  [1] 1 1 1 2 1 1 1 1 1 2 2 1 1 1 2 2 2 2 1 1 1 2 2 2 2 1 1 2 1 1 2 2 1 1 2 2 1 1
## [39] 1 1 1 1 2 1 1 1 2 1 1 1 1
## Objective function:
##    build     swap 
## 5.810796 5.717403 
## 
## Available components:
##  [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
##  [6] "clusinfo"   "silinfo"    "diss"       "call"       "data"
# Attach the PAM cluster assignment as a factor and plot the two clusters.
pamclust <- clust_dat %>%
  mutate(cluster = as.factor(pam1$clustering))
ggplot(pamclust, aes(x = Speeding, y = Alcohol, color = cluster)) +
  geom_point()

# Per-cluster means of the numeric variables. summarize_if() is superseded;
# use across(where(...)), and spell out na.rm = TRUE rather than the
# reassignable T alias.
pamclust %>%
  group_by(cluster) %>%
  summarize(across(where(is.numeric), function(x) mean(x, na.rm = TRUE)))
## # A tibble: 2 x 3
##   cluster Speeding Alcohol
##   <fct>      <dbl>   <dbl>
## 1 1           37.8    31.5
## 2 2           20.6    29.2
# Average silhouette width of the k = 2 PAM solution (~0.53 here, which is
# conventionally read as a reasonable cluster structure).
pam1$silinfo$avg.width
## [1] 0.5327257
# Silhouette plot for the PAM fit.
plot(pam1, which = 2)

# Average silhouette width for k = 2..10 using PAM (deterministic, unlike
# kmeans). Store the widths in a vector aligned with ks so the line plot has
# no leading NA point (the original indexed from 2, leaving sil_width[1]
# missing and triggering a removed-row warning from ggplot).
pam_dat <- drivers2 %>% select(Speeding, Alcohol)
ks <- 2:10
sil_width <- vector("numeric", length(ks))
for (j in seq_along(ks)) {
    pam_fit <- pam(pam_dat, k = ks[j])
    sil_width[j] <- pam_fit$silinfo$avg.width
}
ggplot() + geom_line(aes(x = ks, y = sil_width)) +
    scale_x_continuous(name = "k", breaks = ks)

# Standardize every numeric column (dropping the two non-numeric ones) and
# re-fit PAM with k = 2 on the full scaled data.
final1 <- drivers2 %>%
  select(-Region, -State) %>%
  scale() %>%
  as.data.frame()
pam2 <- pam(final1, 2)

# Color the Speeding/Alcohol scatterplot by the new cluster assignment.
final1 <- final1 %>%
  mutate(cluster = as.factor(pam2$clustering))
ggplot(final1, aes(x = Speeding, y = Alcohol, color = cluster)) +
  geom_point()

library(plotly)
# Interactive 3-D view of the two clusters in three of the scaled variables.
final1 %>% plot_ly(x = ~Speeding, y = ~Alcohol, z = ~NotDistracted, 
    color = ~cluster, type = "scatter3d", mode = "markers")
library(GGally)
# Pairwise scatterplot matrix of all scaled variables, colored by cluster.
ggpairs(final1, aes(color = cluster))

Using PAM to find silhouette width, I decided to use two clusters, as two had the highest silhouette width. In terms of the original variables and observations, cluster one represents the higher values for Percent Speeding and Percent Alcohol, and cluster two represents the lower values for the same variables. The variable that shows the greatest difference between the two clusters is Percent Speeding, while the variable that shows the least difference is Percent Not Distracted. The cluster solution is reasonable in terms of average silhouette width, as the original solution had a silhouette width of 0.53.

Dimensionality Reduction with PCA

# PCA on the standardized numeric variables (drop the two non-numeric
# columns in one select() rather than two chained calls).
drivers3 <- drivers2 %>% select(-Region, -State)
# select(where(is.numeric)) replaces the superseded select_if().
driver_nums <- drivers3 %>% select(where(is.numeric)) %>% scale

drivers_pca <- princomp(driver_nums)
# loadings = TRUE (spelled out, not the reassignable T alias) prints the
# component loadings along with the variance summary.
summary(drivers_pca, loadings = TRUE)
## Importance of components:
##                           Comp.1    Comp.2    Comp.3    Comp.4    Comp.5
## Standard deviation     1.3037496 1.1776787 1.0250007 1.0086385 0.9217813
## Proportion of Variance 0.2476798 0.2020951 0.1530913 0.1482427 0.1238106
## Cumulative Proportion  0.2476798 0.4497748 0.6028661 0.7511088 0.8749194
##                            Comp.6     Comp.7
## Standard deviation     0.72955099 0.57109700
## Proportion of Variance 0.07755565 0.04752497
## Cumulative Proportion  0.95247503 1.00000000
## 
## Loadings:
##                 Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## FatalCollisions  0.280         0.625  0.401  0.516  0.233  0.219
## Speeding         0.144  0.467        -0.691         0.519       
## Alcohol          0.305  0.539  0.366        -0.298 -0.605 -0.169
## NotDistracted    0.144  0.363 -0.574  0.103  0.659 -0.250 -0.106
## NoAccidents     -0.242 -0.381  0.323 -0.541  0.445 -0.431 -0.123
## Premium         -0.607  0.367                      -0.154  0.687
## Losses          -0.601  0.278  0.188  0.238         0.196 -0.652
# Scree plot: proportion of variance explained by each of the 7 components.
eigval <- drivers_pca$sdev^2
# Use <- for assignment (was =); varprop is already rounded here, so the
# geom_text label below does not need a second round().
varprop <- round(eigval / sum(eigval), 2)
ggplot() +
    geom_bar(aes(y = varprop, x = 1:7), stat = "identity") +
    geom_path(aes(y = varprop, x = 1:7)) +
    geom_text(aes(x = 1:7, y = varprop, label = varprop),
        vjust = 1, col = "white", size = 4) +
    xlab("") +
    scale_y_continuous(breaks = seq(0, 0.6, 0.2), labels = scales::percent) +
    scale_x_continuous(breaks = 1:7)  # only 7 PCs exist (was breaks = 1:10)

# Cumulative proportion of variance: the first 4 PCs explain ~75%.
round(cumsum(eigval)/sum(eigval), 2)
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 
##   0.25   0.45   0.60   0.75   0.87   0.95   1.00
# Raw eigenvalues (component variances).
eigval
##    Comp.1    Comp.2    Comp.3    Comp.4    Comp.5    Comp.6    Comp.7 
## 1.6997630 1.3869270 1.0506263 1.0173516 0.8496807 0.5322447 0.3261518
library(factoextra)
# Biplot of observations and variable loadings on PC1/PC2.
fviz_pca_biplot(drivers_pca)

I retained four PCs. 75% of the variance in the dataset is explained by these four PCs. Scoring high on PC1 indicates high scores on Percent Speeding, Alcohol, and Not Distracted, but low scores on Percent No Accidents, Premiums, and Losses; the opposite is true when scoring low on PC1. Scoring high on PC2 means high values on Percent Speeding, Alcohol, Not Distracted, Premium, and Losses, but low values on Percent No Accidents, and scoring low on PC2 means low values on the same variables and a high value on Percent No Accidents. Scoring high on PC3 means low Percent Not Distracted, and high Fatal Collisions, Percent Alcohol, Percent No Accidents, Premium, and Losses; scoring low on PC3 means high Percent Not Distracted and low values for Fatal Collisions, Percent Alcohol, Percent No Accidents, Premium, and Losses. PC4 is a Percent Speeding versus Percent No Accidents axis: scoring high on PC4 means high values for Fatal Collisions, Percent Not Distracted, and Losses, but low values for Percent Speeding and Percent No Accidents, while scoring low on PC4 means the reverse.

Linear Classifier

# Logistic regression of Region (Western coded as success) on every other
# variable; print the in-sample predicted probabilities.
drivers3 <- drivers2 %>% select(-State)
fit1 <- glm(Region == "Western" ~ ., family = "binomial", data = drivers3)
score <- predict(fit1, type = "response")
round(score, 3)
##     1     2     3     4     5     6     7     8     9    10    11    12    13 
## 0.019 0.861 0.915 0.026 0.015 0.537 0.011 0.636 0.433 0.020 0.010 0.865 0.985 
##    14    15    16    17    18    19    20    21    22    23    24    25    26 
## 0.216 0.227 0.007 0.040 0.001 0.077 0.487 0.039 0.000 0.000 0.005 0.169 0.064 
##    27    28    29    30    31    32    33    34    35    36    37    38    39 
## 0.587 0.002 0.586 0.052 0.000 0.353 0.003 0.053 0.008 0.002 0.023 0.732 0.458 
##    40    41    42    43    44    45    46    47    48    49    50    51 
## 0.001 0.059 0.187 0.000 0.034 0.991 0.395 0.001 0.453 0.055 0.567 0.732
# In-sample diagnostics. class_diag() is a course-provided helper defined
# elsewhere; presumably it returns accuracy/sens/spec/ppv/f1/ba/AUC — its
# definition is not in this file, so verify against the course materials.
probs <- predict(fit1, type = "response")
class_diag(probs, drivers3$Region, positive = "Western")
##           acc   sens   spec    ppv  f1     ba    auc
## Metrics 0.902 0.7692 0.9474 0.8333 0.8 0.8583 0.9251
# Confusion matrix at a 0.5 probability cutoff.
table(truth = drivers3$Region, predictions = probs > 0.5)
##          predictions
## truth     FALSE TRUE
##   Eastern    36    2
##   Western     3   10
# 5-fold cross-validation of the logistic model.
# Fix the RNG seed before sample(): the other CV chunks in this report seed
# their RNG, but this one originally did not, so its results were not
# reproducible across knits.
set.seed(1234)
drivers5 <- drivers2 %>% select(-State)
k <- 5
data <- drivers5[sample(nrow(drivers5)), ]  # shuffle rows before folding
folds <- cut(seq_len(nrow(drivers5)), breaks = k, labels = FALSE)
diags <- NULL
for (i in 1:k) {
    train <- data[folds != i, ]
    test <- data[folds == i, ]
    truth <- test$Region
    fit <- glm(Region ~ ., data = train, family = "binomial")
    probs <- predict(fit, newdata = test, type = "response")
    diags <- rbind(diags, class_diag(probs, truth, positive = "Western"))
}
# Average out-of-sample diagnostics across the k folds.
summarize_all(diags, mean)
##       acc    sens    spec  ppv  f1      ba     auc
## 1 0.78546 0.42666 0.84064 0.48 NaN 0.63364 0.70774

The in-sample AUC of 0.9251 indicates the linear classifier discriminates very well between Eastern and Western states on the data it was trained on. Under 5-fold cross-validation, however, the AUC drops to roughly 0.71, which suggests the model is overfitting and predicts new observations noticeably worse than it fits the training data.

Non-Parametric Classifier

library(caret)
# k-nearest-neighbors (k = 5) classifier for Region using all seven numeric
# predictors. The response is recoded so that "TRUE" (Western) is the first
# factor level, which makes it the first column of the predicted-probability
# matrix below — the level order here and the column index used later must
# stay in sync.
knn_fit <- knn3(factor(Region == "Western", levels = c("TRUE", 
    "FALSE")) ~ Speeding + Alcohol + NoAccidents + NotDistracted + 
    FatalCollisions + Premium + Losses, data = drivers5, k = 5)
# In-sample predicted class probabilities for every state.
y_hat_knn <- predict(knn_fit, drivers5)
y_hat_knn
##       TRUE FALSE
##  [1,]  0.0   1.0
##  [2,]  0.4   0.6
##  [3,]  0.6   0.4
##  [4,]  0.4   0.6
##  [5,]  0.2   0.8
##  [6,]  0.4   0.6
##  [7,]  0.4   0.6
##  [8,]  0.0   1.0
##  [9,]  0.0   1.0
## [10,]  0.0   1.0
## [11,]  0.4   0.6
## [12,]  0.8   0.2
## [13,]  0.2   0.8
## [14,]  0.2   0.8
## [15,]  0.0   1.0
## [16,]  0.2   0.8
## [17,]  0.2   0.8
## [18,]  0.6   0.4
## [19,]  0.0   1.0
## [20,]  0.2   0.8
## [21,]  0.4   0.6
## [22,]  0.4   0.6
## [23,]  0.0   1.0
## [24,]  0.0   1.0
## [25,]  0.4   0.6
## [26,]  0.2   0.8
## [27,]  0.8   0.2
## [28,]  0.0   1.0
## [29,]  0.4   0.6
## [30,]  0.0   1.0
## [31,]  0.0   1.0
## [32,]  0.8   0.2
## [33,]  0.0   1.0
## [34,]  0.0   1.0
## [35,]  0.0   1.0
## [36,]  0.0   1.0
## [37,]  0.2   0.8
## [38,]  0.8   0.2
## [39,]  0.4   0.6
## [40,]  0.0   1.0
## [41,]  0.6   0.4
## [42,]  0.2   0.8
## [43,]  0.0   1.0
## [44,]  0.4   0.6
## [45,]  0.8   0.2
## [46,]  0.0   1.0
## [47,]  0.0   1.0
## [48,]  0.8   0.2
## [49,]  0.4   0.6
## [50,]  0.2   0.8
##  [ reached getOption("max.print") -- omitted 1 row ]
# Confusion matrix at a 0.5 cutoff on the "TRUE" (Western) probability
# column — column 1 because of the explicit level order used when fitting.
table(truth = factor(drivers5$Region == "Western", levels = c("TRUE", 
    "FALSE")), prediction = factor(y_hat_knn[, 1] > 0.5, levels = c("TRUE", 
    "FALSE")))
##        prediction
## truth   TRUE FALSE
##   TRUE     7     6
##   FALSE    2    36
# In-sample diagnostics for the kNN fit (class_diag() is the course helper
# used above; defined elsewhere).
class_diag(y_hat_knn[, 1], drivers2$Region, positive = "Western")
##            acc   sens   spec    ppv     f1     ba    auc
## Metrics 0.8431 0.5385 0.9474 0.7778 0.6364 0.7429 0.8674
# 5-fold cross-validation of the kNN classifier (seeded for reproducibility).
set.seed(1234)
drivers5 <- drivers2 %>% select(-State)
k <- 5
data <- drivers5[sample(nrow(drivers5)), ]  # shuffle rows before folding
folds <- cut(seq_len(nrow(drivers5)), breaks = k, labels = FALSE)
diags <- NULL
for (i in 1:k) {
    train <- data[folds != i, ]
    test <- data[folds == i, ]
    truth <- test$Region
    fit <- knn3(Region ~ ., data = train)
    # Select the "Western" probability column by name rather than by
    # position ([, 2]), which silently depended on the factor-level order.
    probs <- predict(fit, newdata = test)[, "Western"]
    diags <- rbind(diags, class_diag(probs, truth, positive = "Western"))
}
# Average out-of-sample diagnostics across the k folds.
summarize_all(diags, mean)
##       acc sens    spec  ppv  f1     ba     auc
## 1 0.80182 0.58 0.89564 0.62 NaN 0.7378 0.78532
# Refit the logistic model on the same data (same specification as fit1:
# with a factor response, glm() treats the second level, "Western", as the
# success class, so the coefficients and diagnostics match fit1's exactly).
fit2 <- glm(Region ~ ., data = drivers5, family = "binomial")
coef(fit2)
##     (Intercept) FatalCollisions        Speeding         Alcohol   NotDistracted 
##   -1.815289e+01    1.159003e-01    2.234030e-01   -1.299457e-01   -4.130500e-02 
##     NoAccidents         Premium          Losses 
##    2.540695e-01    7.987524e-04   -6.749379e-02
probs3 <- predict(fit2, type = "response")
class_diag(probs3, drivers5$Region, positive = "Western")
##           acc   sens   spec    ppv  f1     ba    auc
## Metrics 0.902 0.7692 0.9474 0.8333 0.8 0.8583 0.9251

The original calculated AUC value of 0.8674 is considered a good value. The model predicts new observations reasonably well per the CV AUC value of 0.78532. Because the model did do worse in CV AUC as the value dropped, that is a sign of overfitting. The non-parametric model did worse than the linear model in terms of its cross-validation performance because the non-parametric model's CV AUC score is lower than the linear model's, but the linear model's CV AUC score dropped more than the non-parametric model's.

Regression/Numeric Prediction

# In-sample MSE of the linear model predicting FatalCollisions.
# BUG FIX: the original fit on drivers2 still contained State — a 51-level
# factor with only 51 observations — so the model interpolated the data
# perfectly and the reported "MSE" (~2e-28) was numerically zero and
# meaningless. Drop State (as the CV chunk below does) for an honest
# training MSE on the same predictor set.
fit <- lm(FatalCollisions ~ ., data = drivers2 %>% select(-State))
yhat1 <- predict(fit)
mean((drivers2$FatalCollisions - yhat1)^2)
## [1] 2.21871e-28
# 5-fold cross-validated MSE for the linear model (seeded for reproducibility).
set.seed(1234)
drivers5 <- drivers2 %>% select(-State)
k <- 5
data <- drivers5[sample(nrow(drivers5)), ]  # shuffle rows before folding
folds <- cut(seq_len(nrow(drivers5)), breaks = k, labels = FALSE)
diags <- NULL
for (i in 1:k) {
    train <- data[folds != i, ]
    test <- data[folds == i, ]
    fit <- lm(FatalCollisions ~ ., data = train)
    yhat <- predict(fit, newdata = test)
    # BUG FIX: the original assigned `diags <- mean(...)`, overwriting the
    # accumulator on every iteration, so mean(diags) reported only the last
    # fold's MSE rather than the average. Accumulate one MSE per fold.
    diags <- c(diags, mean((test$FatalCollisions - yhat)^2))
}
# Average test MSE across the k folds.
mean(diags)
## [1] 16.50434

This model does show signs of overfitting, as the MSE value is higher in CV. Across the k testing folds, the overall mean squared error is equal to 16.50434.

Python

library(reticulate)
# Pull the bad_drivers data straight from the fivethirtyeight package so the
# Python chunk below can access it through reticulate's r. interface.
drivers8 <- fivethirtyeight::bad_drivers
# NOTE(review): py$df2 is referenced here before the Python chunk at the end
# of the report creates df2, so this prints NULL — this call should come
# after that chunk.
head(py$df2)
## NULL
# (Python chunk) Bring the R data frame into Python and filter to states
# with more than 20 fatal-collision drivers per billion miles.
df=r.drivers8
df[df["num_drivers"]>20.0]
##              state  num_drivers   ...    insurance_premiums  losses
## 3         Arkansas         22.4   ...                827.34  142.39
## 17        Kentucky         21.4   ...                872.51  137.13
## 18       Louisiana         20.5   ...               1281.55  194.78
## 26         Montana         21.4   ...                816.21   85.15
## 34    North Dakota         23.9   ...                688.75  109.72
## 40  South Carolina         23.9   ...                858.97  116.29
## 48   West Virginia         23.8   ...                992.61  152.56
## 
## [7 rows x 8 columns]
df2=df[df["num_drivers"]>20.0]

Using reticulate, I first shared my drivers dataset between R and Python using the `r.` prefix. I narrowed down the dataframe in Python to only include states that had over 20.0 for the variable that measures the number of drivers involved in fatal collisions per billion miles. I then shared the object I created back with R using the `py$` prefix.